View Javadoc

1   
2   /*
3    * SmartCrawler
4    *
5    * $Id: ContainedWordFilter.java,v 1.3 2005/08/05 15:55:53 vincool Exp $
6    * Copyright 2005 Davide Pozza
7    *
8    * This program is free software; you can redistribute it
9    * and/or modify it under the terms of the GNU General Public
10   * License as published by the Free Software Foundation;
11   * either version 2 of the License, or (at your option) any
12   * later version.
13   *
14   * This program is distributed in the hope that it will be
15   * useful, but WITHOUT ANY WARRANTY; without even the implied
16   * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
17   * PURPOSE. See the GNU General Public License for more
18   * details.
19   *
20   * You should have received a copy of the GNU General Public
21   * License along with this program; if not, write to the Free
22   * Software Foundation, Inc., 59 Temple Place, Suite 330,
23   * Boston, MA 02111-1307 USA
24   *
25   */
26  
27  package org.smartcrawler.filter;
28  import java.nio.ByteBuffer;
29  import java.nio.CharBuffer;
30  import java.nio.charset.Charset;
31  import java.util.regex.Matcher;
32  import java.util.regex.Pattern;
33  import org.apache.log4j.Logger;
34  import org.smartcrawler.common.AbstractParametrizableComponent;
35  import org.smartcrawler.common.Context;
36  import org.smartcrawler.common.SCLogger;
37  import org.smartcrawler.retriever.Content;
38  
39  /***
40   *
41   *
42   * @author <a href="mailto:pozzad@alice.it">Davide Pozza</a>
43   * @version <tt>$Revision: 1.3 $</tt>
44   */
45  public class ContainedWordFilter extends AbstractParametrizableComponent implements PostFilterLink {
46  
47      private static Logger log = SCLogger.getLogger(ContainedWordFilter.class);
48  
49  
50      /***
51       *
52       * @param link
53       * @return
54       */
55      public boolean isPermitted(Context conf, Content content) {
56          String keyword = getParameter("keyword");
57          log.debug("isPermitted() BEGIN [keyword=" + keyword + "]");
58  
59          boolean res = false;
60  
61          boolean isHtml = content.getContentType().indexOf("htm") >= 0;
62  
63          if (isHtml) {
64              try {
65                  Pattern p =
66                          Pattern.compile("(<[^<]*>([^<]*)<[^<]*>)",
67                          Pattern.MULTILINE |
68                          Pattern.CASE_INSENSITIVE |
69                          Pattern.DOTALL);
70                  byte[] buffer = content.getBuffer();
71                  ByteBuffer bbuf = ByteBuffer.allocate(buffer.length);
72                  bbuf.put(buffer);
73                  bbuf.flip();
74                  CharBuffer charBuf =
75                          Charset.forName("8859_1").newDecoder().decode(bbuf);
76                  Matcher matcher = p.matcher(charBuf);
77                  while (matcher.find()) {
78                      CharSequence cs = matcher.group(2);
79                      //CharSequence cs = m.group(2);
80                      if (cs.toString().trim().toLowerCase().contains(keyword)) {
81                          res = true;
82                          break;
83                      }
84                  }
85              }catch (Exception e) {
86                  log.warn("Unable to apply filter on " + content.getLink());
87              }
88          }
89          log.debug("Checking content res=" + res);
90          log.debug("isPermitted() END");
91          return res;
92      }
93  }